Final Project: Modelling¶
Importing Packages / Libraries / Modules¶
# Basic Data & Visualization Tools
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(font_scale = 2)
# Data Import Tools
import sqlite3
# NLP Tools
import spacy
import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
nltk.download('wordnet')
nltk.download('punkt')
# Model 1
from textblob import TextBlob
from textblob.classifiers import NaiveBayesClassifier
# Model 2
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score, roc_curve, precision_recall_curve, classification_report, confusion
from sklearn.model_selection import learning_curve
# Model 3
# No warnings about setting value on copy of slice
pd.options.mode.chained_assignment = None
pd.set_option('display.max_columns', 60)
# Set default font size
plt.rcParams['font.size'] = 24
from IPython.core.pylabtools import figsize
# Imputing missing values
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
# Machine Learning Models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import tree
# LIME for explaining predictions
import lime
import lime.lime_tabular
[nltk_data] Downloading package wordnet to [nltk_data] /Users/markjones/nltk_data... [nltk_data] Package wordnet is already up-to-date! [nltk_data] Downloading package punkt to /Users/markjones/nltk_data... [nltk_data] Package punkt is already up-to-date!
Model 1: TextBlob Naive Bayes Classifier¶
Import the Data we have pre-processed / prepared¶
# Open the SQLite database that holds the pre-processed Amazon reviews
conn = sqlite3.connect('db/amazon_reviews.db')
# Pull the entire processed table into a DataFrame
df = pd.read_sql('SELECT * FROM Processed_Full_Dataset', conn)
df.head(3)
| index | Id | ProductId | ProfileName | HelpfulnessNumerator | HelpfulnessDenominator | Score | Summary | dataset | clean_review | Amazon_Tag | helpfulness_pct | lemma | nouns | propn | adjectives | verbs | npav | no_tokens | Quality_Sentiment_Score | Price_Sentiment_Score | Overall_Score | PN_Labels | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 1 | B001E4KFG0 | delmartian | 1 | 1 | 5 | Good Quality Dog Food | Test | i have bought several of the vitality canned d... | positive | 1.0 | I have buy several of the vitality can dog foo... | vitality dog food product quality product stew... | several good finicky most | buy can find look process smell appreciate | vitality dog food product quality product stew... | 52.0 | 0.35 | N/A | 0.35 | positive | |
| 1 | 1 | 2 | B00813GRG4 | dll pa | 0 | 0 | 1 | Not as Advertised | Test | product arrived labeled as jumbo salted peanut... | negative | 0.0 | product arrive label as jumbo salt peanut ... ... | product peanut peanut unsalted error vendor pr... | jumbo small sized sure jumbo | arrive label salt intend represent | product peanut peanut unsalted error vendor pr... | 37.0 | N/A | N/A | 0.075 | positive | |
| 2 | 2 | 3 | B000LQOCH0 | Natalia Corres "Natalia Corres" | 1 | 1 | 4 | "Delight" says it all | Test | this is a confection that has been around a fe... | positive | 1.0 | this be a confection that have be around a few... | confection century light citrus nut case filbe... | gelatin heaven c.s . lewis | few pillowy tiny powdered tiny flavorful yummy... | cut coat recommend seduce sell | confection century light citrus nut case filbe... | 114.0 | N/A | N/A | -0.2 | negative |
# Total number of reviews in the full dataset
len(df)
568454
The dataset is too large, so we will sample it to obtain a smaller subset on which to train the Naive Bayes Classifier.
# Draw a reproducible 2% random sample to keep NB training tractable
df_nbc = df.sample(random_state=786, frac=0.02)
len(df_nbc)
11369
We want to try the Naive Bayes classifier using the Amazon tag (i.e. if a review's score was >= 3, it was tagged positive).
# Features: cleaned review text; labels: Amazon positive/negative tag
x = df_nbc['clean_review']
y = df_nbc['Amazon_Tag']
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=.5)
# Pair each review with its label — TextBlob expects (text, label) tuples
train = list(zip(x_train, y_train))
test = list(zip(x_test, y_test))
Train the model using a Naive Bayes classifier¶
# Train the classifier on the cleaned corpus against the labelled positive/negative tag.
# NOTE: TextBlob's NaiveBayesClassifier is pure Python — training on ~5.7k documents is slow.
classifier = NaiveBayesClassifier(train)
Test the TextBlob Naive Bayes Classifier Model¶
(this will be used as our benchmark)
# Test the accuracy on the held-out half — this is the benchmark for later models
print(classifier.accuracy(test))
0.8504837291116975
Model 2: Logistic Regression using the TF-IDF Vectors¶
# Vectorise the review text with TF-IDF — fit the vocabulary on the
# training split only, then apply the same transform to the test split
tfidf_vectorizer = TfidfVectorizer()
x_train_tfidf = tfidf_vectorizer.fit_transform(x_train)
x_test_tfidf = tfidf_vectorizer.transform(x_test)

# Fit a logistic-regression classifier on the TF-IDF features
model = LogisticRegression()
model.fit(x_train_tfidf, y_train)

# Predict labels for the held-out test set
y_pred = model.predict(x_test_tfidf)

# Sanity-check which class labels actually occur in the test split
unique_labels = np.unique(y_test)
print("Unique labels in y_test:", unique_labels)
Unique labels in y_test: ['negative' 'positive']
# Sanity check: predictions and ground truth must align one-to-one
print(len(y_test))
print(len(y_pred))
5685 5685
# Score the positive class with precision, recall, and F1
precision = precision_score(y_test, y_pred, pos_label='positive')
recall = recall_score(y_test, y_pred, pos_label='positive')
f1 = f1_score(y_test, y_pred, pos_label='positive')
print("Precision: %.4f" % precision)
print("Recall: %.4f" % recall)
print("F1-score: %.4f" % f1)
# Probability of the positive class, reused by both curve plots below
positive_scores = model.predict_proba(x_test_tfidf)[:, 1]

# --- ROC curve ------------------------------------------------------------
fpr, tpr, thresholds = roc_curve(y_test, positive_scores, pos_label='positive')
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label="ROC Curve")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Receiver Operating Characteristic (ROC) Curve")
plt.legend()
plt.show()

# --- Precision-recall curve ----------------------------------------------
precision, recall, _ = precision_recall_curve(y_test, positive_scores, pos_label='positive')
plt.figure(figsize=(8, 6))
plt.plot(recall, precision, label="Precision-Recall Curve")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Precision-Recall Curve")
plt.legend()
plt.show()

# --- Learning curve (5-fold CV accuracy vs. training-set size) ------------
train_sizes, train_scores, test_scores = learning_curve(
    model, x_train_tfidf, y_train, cv=5, scoring='accuracy',
    train_sizes=np.linspace(0.1, 1.0, 10),
)
plt.figure(figsize=(8, 6))
plt.plot(train_sizes, train_scores.mean(axis=1), label="Training Accuracy")
plt.plot(train_sizes, test_scores.mean(axis=1), label="Testing Accuracy")
plt.xlabel("Training Examples")
plt.ylabel("Accuracy")
plt.title("Learning Curve")
plt.legend()
plt.show()
Precision: 0.8661 Recall: 0.9961 F1-score: 0.9266
# Full per-class breakdown of the logistic-regression classifier's performance
print(classification_report(y_test, y_pred))
precision recall f1-score support
negative 0.87 0.14 0.25 866
positive 0.87 1.00 0.93 4819
accuracy 0.87 5685
macro avg 0.87 0.57 0.59 5685
weighted avg 0.87 0.87 0.82 5685
The recall for the negative class above shows that the model is not performing well on negative reviews. This is likely because the dataset is imbalanced.
# Inspect column dtypes — the three score columns are still 'object' (strings with 'N/A')
df.dtypes
index int64 Id int64 ProductId object ProfileName object HelpfulnessNumerator int64 HelpfulnessDenominator int64 Score int64 Summary object dataset object clean_review object Amazon_Tag object helpfulness_pct float64 lemma object nouns object propn object adjectives object verbs object npav object no_tokens float64 Quality_Sentiment_Score object Price_Sentiment_Score object Overall_Score object PN_Labels object dtype: object
# Confirm we are back to the full dataset (not the NB sample)
print(len(df))
568454
# Convert the sentiment/score columns from object (string) dtype to numeric.
# 'N/A' strings become NaN under errors='coerce' and are then zero-filled.
df['Quality_Sentiment_Score'] = (
    pd.to_numeric(df['Quality_Sentiment_Score'], errors='coerce')
    .fillna(0)
)
df['Price_Sentiment_Score'] = (
    pd.to_numeric(df['Price_Sentiment_Score'], errors='coerce')
    .fillna(0)
)
# BUG FIX: the third conversion previously assigned the numeric Overall_Score
# into df['Quality_Sentiment_Score'], clobbering the quality scores and
# leaving 'Overall_Score' itself as an unconverted object column.
df['Overall_Score'] = (
    pd.to_numeric(df['Overall_Score'], errors='coerce')
    .fillna(0)
)
# Spot-check the converted score columns
df.head(3)
| index | Id | ProductId | ProfileName | HelpfulnessNumerator | HelpfulnessDenominator | Score | Summary | dataset | clean_review | Amazon_Tag | helpfulness_pct | lemma | nouns | propn | adjectives | verbs | npav | no_tokens | Quality_Sentiment_Score | Price_Sentiment_Score | Overall_Score | PN_Labels | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 1 | B001E4KFG0 | delmartian | 1 | 1 | 5 | Good Quality Dog Food | Test | i have bought several of the vitality canned d... | positive | 1.0 | I have buy several of the vitality can dog foo... | vitality dog food product quality product stew... | several good finicky most | buy can find look process smell appreciate | vitality dog food product quality product stew... | 52.0 | 0.350 | 0.0 | 0.35 | positive | |
| 1 | 1 | 2 | B00813GRG4 | dll pa | 0 | 0 | 1 | Not as Advertised | Test | product arrived labeled as jumbo salted peanut... | negative | 0.0 | product arrive label as jumbo salt peanut ... ... | product peanut peanut unsalted error vendor pr... | jumbo small sized sure jumbo | arrive label salt intend represent | product peanut peanut unsalted error vendor pr... | 37.0 | 0.075 | 0.0 | 0.075 | positive | |
| 2 | 2 | 3 | B000LQOCH0 | Natalia Corres "Natalia Corres" | 1 | 1 | 4 | "Delight" says it all | Test | this is a confection that has been around a fe... | positive | 1.0 | this be a confection that have be around a few... | confection century light citrus nut case filbe... | gelatin heaven c.s . lewis | few pillowy tiny powdered tiny flavorful yummy... | cut coat recommend seduce sell | confection century light citrus nut case filbe... | 114.0 | -0.200 | 0.0 | -0.2 | negative |
# Numeric feature columns fed to the regression models, and the label column
features_keep = ('helpfulness_pct', 'no_tokens', 'Quality_Sentiment_Score',
                 'Price_Sentiment_Score', 'Overall_Score')
label_keep = 'Amazon_Tag'
# Keep only the feature columns (order follows the original frame)
x = df[[col for col in df.columns if col in features_keep]]
x.head(3)
| helpfulness_pct | no_tokens | Quality_Sentiment_Score | Price_Sentiment_Score | Overall_Score | |
|---|---|---|---|---|---|
| 0 | 1.0 | 52.0 | 0.350 | 0.0 | 0.35 |
| 1 | 0.0 | 37.0 | 0.075 | 0.0 | 0.075 |
| 2 | 1.0 | 114.0 | -0.200 | 0.0 | -0.2 |
# Single-column frame holding the label.
# BUG FIX (latent): the original `df.drop(columns=[col for col in df if col
# not in label_keep])` tested membership against the *string* 'Amazon_Tag',
# which is a substring check — any column whose name is a substring of
# 'Amazon_Tag' would also have been kept. Select the column explicitly.
y = df[[label_keep]]
y.head(3)
| Amazon_Tag | |
|---|---|
| 0 | positive |
| 1 | negative |
| 2 | positive |
# Encode the labels numerically: positive -> 1, negative -> 0.
# Assign the result back instead of using inplace=True on a selected column —
# that pattern relies on chained assignment, which is deprecated in modern
# pandas and only silenced here by pd.options.mode.chained_assignment = None.
y['Amazon_Tag'] = y['Amazon_Tag'].replace({'positive': 1, 'negative': 0})
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.5)
# Median imputation for any missing feature values
imputer = SimpleImputer(strategy='median', missing_values=np.nan)
# Learn the column medians from the training features only (no test leakage)
imputer.fit(x_train)
SimpleImputer(strategy='median')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
SimpleImputer(strategy='median')
# Apply the fitted imputer to both splits
x_train = imputer.transform(x_train)
x_test = imputer.transform(x_test)
# Flatten the single-column label frames to 1-D arrays, as sklearn expects
y_train = np.array(y_train).reshape(-1)
y_test = np.array(y_test).reshape(-1)
# Mean-absolute-error helper used to score all the regression models below
def mae(y_true, y_pred):
    """Return the mean absolute error between truth and predictions."""
    return np.mean(np.abs(y_pred - y_true))
# Gradient-boosted regression trees with fixed hyperparameters
model = GradientBoostingRegressor(n_estimators=800,
                                  max_depth=5,
                                  min_samples_split=6,
                                  min_samples_leaf=6,
                                  max_features=None,
                                  loss='squared_error',
                                  random_state=42)
model.fit(x_train, y_train)
GradientBoostingRegressor(max_depth=5, min_samples_leaf=6, min_samples_split=6,
n_estimators=800, random_state=42)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GradientBoostingRegressor(max_depth=5, min_samples_leaf=6, min_samples_split=6,
n_estimators=800, random_state=42)# Make predictions on the test set
# Predict on the held-out test set and report the mean absolute error
model_pred = model.predict(x_test)
print(f'Final Model Performance on the test set: MAE = {mae(y_test, model_pred):.4f}')
Final Model Performance on the test set: MAE = 0.2200
# Collect the fitted model's feature importances into a DataFrame
feature_results = pd.DataFrame({'feature': list(x.columns),
                                'importance': model.feature_importances_})
# Rank features from most to least important, then show the top 10
feature_results = (feature_results
                   .sort_values('importance', ascending=False)
                   .reset_index(drop=True))
feature_results.head(10)
| feature | importance | |
|---|---|---|
| 0 | Overall_Score | 0.427683 |
| 1 | Quality_Sentiment_Score | 0.242368 |
| 2 | no_tokens | 0.198608 |
| 3 | helpfulness_pct | 0.113719 |
| 4 | Price_Sentiment_Score | 0.017623 |
figsize(12, 10)
plt.style.use('fivethirtyeight')
# Plot the 10 most important features in a horizontal bar chart
feature_results.loc[:9, :].plot(x = 'feature', y = 'importance',
                                edgecolor = 'k',
                                kind='barh', color = 'blue');
plt.xlabel('Relative Importance', size = 20); plt.ylabel('')
# BUG FIX: the title previously said "Random Forest", but the fitted model
# is a GradientBoostingRegressor.
plt.title('Feature Importances from Gradient Boosting', size = 30);
# Names of the (up to) 10 most important features
most_important_features = feature_results['feature'][:10]
# Map each feature name to its column position in the original frame.
# FIX: the comprehension variable was previously named `x_train`, shadowing
# the training matrix inside the comprehension and misleading readers into
# thinking the training data had been clobbered.
column_order = list(x.columns)
indices = [column_order.index(name) for name in most_important_features]
# Keep only the most important feature columns
x_reduced = x_train[:, indices]
x_test_reduced = x_test[:, indices]
print('Most important training features shape: ', x_reduced.shape)
print('Most important testing features shape: ', x_test_reduced.shape)
Most important training features shape: (284227, 5) Most important testing features shape: (284227, 5)
lr = LinearRegression()
# Baseline: fit and predict on the full feature set
lr.fit(x_train, y_train)
lr_full_pred = lr.predict(x_test)
# Refit on the reduced (most-important) feature set
lr.fit(x_reduced, y_train)
lr_reduced_pred = lr.predict(x_test_reduced)
# Compare the two fits on the held-out data
for tag, pred in (('Full', lr_full_pred), ('Reduced', lr_reduced_pred)):
    print('Linear Regression %s Results: MAE = %0.4f.' % (tag, mae(y_test, pred)))
Linear Regression Full Results: MAE = 0.2351. Linear Regression Reduced Results: MAE = 0.2351.
# Rebuild the gradient-boosting model with identical hyperparameters
model_reduced = GradientBoostingRegressor(n_estimators=800,
                                          max_depth=5,
                                          min_samples_split=6,
                                          min_samples_leaf=6,
                                          max_features=None,
                                          loss='squared_error',
                                          random_state=42)
# Fit and evaluate on the reduced feature set
model_reduced.fit(x_reduced, y_train)
model_reduced_pred = model_reduced.predict(x_test_reduced)
print('Gradient Boosted Reduced Results: MAE = %0.4f' % mae(y_test, model_reduced_pred))
Gradient Boosted Reduced Results: MAE = 0.2200
The above shows that the Gradient Boosted Reduced Results did not change from the first attempt at Gradient Boosting.
Model 4: Locally Interpretable Model-agnostic Explanations (LIME)¶
Following Week 6 Tutorial: Model Interpretation
# Absolute residuals on the reduced-feature test set
residuals = abs(model_reduced_pred - y_test)
# Extract the worst (largest residual) and best (smallest residual) predictions
wrong = x_test_reduced[np.argmax(residuals), :]
right = x_test_reduced[np.argmin(residuals), :]
# Build a LIME explainer over the reduced training data
explainer = lime.lime_tabular.LimeTabularExplainer(
    training_data=x_reduced,
    mode='regression',
    training_labels=y_train,
    feature_names=list(most_important_features),
)
Here is an example from the LIME model that was predicted incorrectly.¶
We can see the reasons why this prediction was wrong.
# Display the predicted and true value for the worst prediction.
# FIX: predict() returns a 1-element array — index it explicitly instead of
# relying on implicit array->scalar coercion in '%' formatting, which is
# deprecated since NumPy 1.25 and will become an error.
print('Prediction: %0.4f' % model_reduced.predict(wrong.reshape(1, -1))[0])
print('Actual Value: %0.4f' % y_test[np.argmax(residuals)])
# Explanation for the wrong prediction
wrong_exp = explainer.explain_instance(data_row = wrong,
                                       predict_fn = model_reduced.predict)
# Plot the prediction explanation
wrong_exp.as_pyplot_figure();
plt.title('Explanation of Prediction', size = 28);
plt.xlabel('Effect on Prediction', size = 22);
/var/folders/jz/06jb_cbd77v_7jmcq3q91s9m0000gn/T/ipykernel_96306/2438247814.py:2: DeprecationWarning: Conversion of an array with ndim > 0 to a scalar is deprecated, and will error in future. Ensure you extract a single element from your array before performing this operation. (Deprecated NumPy 1.25.)
print('Prediction: %0.4f' % model_reduced.predict(wrong.reshape(1, -1)))
Prediction: 1.0847 Actual Value: 0.0000
# Interactive LIME explanation for the worst prediction
wrong_exp.show_in_notebook(show_predicted_value=False)
Here is an example from the LIME model that was predicted correctly.¶
We can see the reasons why this prediction was correct.
# Display the predicted and true value for the best prediction.
# FIX: predict() returns a 1-element array — index it explicitly instead of
# relying on implicit array->scalar coercion in '%' formatting, which is
# deprecated since NumPy 1.25 and will become an error.
print('Prediction: %0.4f' % model_reduced.predict(right.reshape(1, -1))[0])
print('Actual Value: %0.4f' % y_test[np.argmin(residuals)])
# Explanation for the correct prediction
right_exp = explainer.explain_instance(right, model_reduced.predict, num_features=10)
right_exp.as_pyplot_figure();
plt.title('Explanation of Prediction', size = 28);
plt.xlabel('Effect on Prediction', size = 22);
/var/folders/jz/06jb_cbd77v_7jmcq3q91s9m0000gn/T/ipykernel_96306/2277720504.py:2: DeprecationWarning: Conversion of an array with ndim > 0 to a scalar is deprecated, and will error in future. Ensure you extract a single element from your array before performing this operation. (Deprecated NumPy 1.25.)
print('Prediction: %0.4f' % model_reduced.predict(right.reshape(1, -1)))
Prediction: 1.0000 Actual Value: 1.0000
# Interactive LIME explanation for the best prediction
right_exp.show_in_notebook(show_predicted_value=False)
# Pull one individual regression tree (estimator #105) out of the ensemble
single_tree = model_reduced.estimators_[105][0]
# Write it out in Graphviz .dot format for visualisation
tree.export_graphviz(single_tree,
                     out_file='images/amazon_tree.dot',
                     feature_names=most_important_features,
                     rounded=True,
                     filled=True)
single_tree
DecisionTreeRegressor(criterion='friedman_mse', max_depth=5, min_samples_leaf=6,
min_samples_split=6,
random_state=RandomState(MT19937) at 0x5D3C66940)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeRegressor(criterion='friedman_mse', max_depth=5, min_samples_leaf=6,
min_samples_split=6,
random_state=RandomState(MT19937) at 0x5D3C66940)!dot -Tpng images/amazon_tree.dot -o images/amazon_tree.png
from IPython.display import Image
# Render the exported decision-tree PNG inline
Image(filename='images/amazon_tree.png')
# Export a shallower (depth-3) view of the same tree for readability
tree.export_graphviz(single_tree, out_file = 'images/amazon_tree_small.dot',
                     rounded = True, feature_names = most_important_features,
                     filled = True, max_depth = 3)
# Convert the .dot file to PNG with Graphviz (IPython shell command)
!dot -Tpng images/amazon_tree_small.dot -o images/amazon_tree_small.png
Image(filename='images/amazon_tree_small.png')